# ========================================================================
# ====  kernel estimators for nonparametric calculation of gradient information ====
# ========================================================================
#install.packages('ks')
library('ks')

#fhat <- kde(x=cbind(a,b), eval.points = dat) #note: the eval.points can be either a grid (default) or observed data
#plot(fhat, display="filled.contour", drawpoints=TRUE, lwd=1)# xlim=c(0,20), ylim=c(0,80))
#fhat$eval.points 
#fhat$estimate

#fD1hat <- kdde(x=cbind(a,b),  deriv.order = 1)
#plot(fD1hat, display="filled.contour", drawpoints=TRUE, lwd=1)# xlim=c(0,20), ylim=c(0,80))
#fD1hat$eval.points 
#fD1hat$estimate

H_v2 <- function(X, type){
  d <- ncol(X)
  fhat <- kde(x=X, eval.points=X)

  if (type == 'S'){
    H <- mean(-log(fhat$estimate ))
    
  }else if (type == 'F'){
    fD1hat <- kdde(x=X,  deriv.order = 1, eval.points=X)
    
    H <- 0
    for (j in 1:d){
      H <- H  - 0.5 * mean(matrix(fD1hat$estimate,ncol=d)[,j]^2/fhat$estimate^2)
    }
    
  }else{
    stop("error in type in func. H")
  }
  
  H
}


MI_v2 <- function(X, ind1, ind2, type){
  #Gaussian assp.
  if (type == 'SG'){
    rho <- cor(X[,c(ind1, ind2)])[1,2]
    mi <- - 0.5 * log(1-rho^2)  
  }else if (type == 'FG'){
    rho <- cor(X[,c(ind1, ind2)])[1,2]
    mi <- 0.5 * rho^2 / (1-rho^2) * (1/var(X[,ind1]) + 1/var(X[,ind2]))
  }else if (type == 'S' || type == 'F'){
    mi <- H_v2(X[,ind1,drop=FALSE], type) + H_v2(X[,ind1,drop=FALSE], type) - H_v2(X[,c(ind1,ind2)], type)
    # mi <- max(mi, 0)
  }else{
    stop("error in type in func. MI!")
  }
  mi
}

#H_v2(dat, 'S')
#MI_v2(dat, 1, 2, 'F')



# ==== Define kernel estimators ====

# we use gaussian kernel, dimension d
# consider x (1 x d), mu vector (n x d samples), d=1,2
#return the scalar of density at x 
K_KER <- function(x,mu){
  n <- nrow(mu)
  d <- length(x)
  hs <- rep(NA, d)
#  h <- sd(mu) * n^(-1/(d+4)) #should use different bandwidth
  for (k in 1:d){
    hs[k] <- sd(mu[,k]) * n^(-1/(d+4)) #bandwidth
    if (k==1){
      s <- (2*pi)^(-1/2) / hs[k] * exp(-((x[k]-mu[,k])/hs[k])^2/2)
    }else{
      s <- s * (2*pi)^(-1/2) / hs[k] * exp(-((x[k]-mu[,k])/hs[k])^2/2)
    }
  }
  k_ker <- sum(s) / n
  k_ker
}

# K_KER_D1 <- function(x,mu,h){
#   s <- (2*pi)^(-0.5)*exp(-((x-mu)/h)^2/2) * (-(x-mu)/h^2)
#   n <- length(mu)
#   k_ker <- sum(s) / (n * h)
# }
#  
# K_KER_D2 <- function(x,mu,h){
#   s <- (2*pi)^(-0.5)*exp(-((x-mu)/h)^2/2) * (((x-mu)/h^2)^2 - 1/h^2)
#   n <- length(mu)
#   k_ker <- sum(s) / (n * h)
# }

#return the vector gradient at x
K_KER_D1 <- function(x,mu){
  n <- nrow(mu)
  d <- length(x)
  # h <- sd(mu) * n^(-1/(d+4)) #bandwidth
  hs <- rep(NA, d)
  for (k in 1:d){
    hs[k] <- sd(mu[,k]) * n^(-1/(d+4)) #bandwidth
    if (k==1){
      s <- (2*pi)^(-1/2) / hs[k] * exp(-((x[k]-mu[,k])/hs[k])^2/2)
    }else{
      s <- s * (2*pi)^(-1/2) / hs[k] * exp(-((x[k]-mu[,k])/hs[k])^2/2)
    }
  }
  # define the derivative at each dimension k
  n <- nrow(mu)
  k_ker_d1 <- rep(NA, d)
  for (k in 1:d){
    sk <- s * (-(x[k]-mu[,k])/hs[k]^2)
    k_ker_d1[k] <- sum(sk) / n
  }
  k_ker_d1
}

#return the diag of second order deriv. of density at x 
#NOTE! if we need this function, need to re-do the h for each dimension
# K_KER_D2 <- function(x,mu){
#   d <- length(x)
#   h <- sd(mu) * n^(-1/(d+4)) #bandwidth
#   for (k in 1:d){
#     if (k==1){
#       s <- exp(-((x[k]-mu[,k])/h)^2/2)
#     }else{
#       s <- s * exp(-((x[k]-mu[,k])/h)^2/2)
#     }
#   }
#   s <- (2*pi)^(-d/2) * h^(-d) * s 
#   # define the derivative at each dimension k
#   n <- nrow(mu)
#   k_ker_d2 <- rep(NA, d)
#   for (k in 1:d){
#     sk <- s * (((x[k]-mu[,k])/h^2)^2 - 1/h^2)
#     k_ker_d2[k] <- sum(sk) / n
#   }
#   k_ker_d2
# }

#test the code is corret - verified
# library('mvtnorm')
# #1D
# K_KER(0,matrix(rnorm(1000),ncol=1)) #= 1/sqrt(2*pi)
# #2D
# x <- matrix(rep(0.5,2), ncol=2)
# mu <- matrix(rnorm(100000), ncol=2)
# K_KER(x,mu) #= 1/(2*pi)
# K_KER_D1(x,mu) #= dmvnorm(x, log=FALSE)*0.5
# K_KER_D2(x,mu) #= dmvnorm(x, log=FALSE)*-0.75


# ==== Calculate two types of entropy ====
#X is nxp data 
#sample: 
#X <- matrix(rnorm(1000), ncol=2) 
#H(X, 'F')
H <- function(X, type){
  n <- nrow(X)
  est_dens <- rep(0,n)
  if (type == 'S'){
    for (j in 1:n){
      est_dens[j] <- K_KER(X[j,],X)
    }
    H <- mean(-log(est_dens))
  }else if (type == 'F'){
    est_dens_D1 <- rep(0,n)
    # est_dens_D2 <- rep(0,n)
    for (j in 1:n){
      est_dens[j] <- K_KER(X[j,],X)
      est_dens_D1[j] <- sum(K_KER_D1(X[j,],X)^2)
      # est_dens_D2[j] <- K_KER_D2(X[j,],X)
    }
    H <- - 0.5 * mean((est_dens_D1/est_dens^2))
  }else{
    stop("error in type in func. H")
  }
  
  H
}

  

# ==== Calculate two types of mutual ====
#X1, X2 are nx1 data from n x p matrix X
#sample: 
#X <- matrix(rnorm(10000), ncol=2) %*% matrix(c(1,0.5,0.5,1), nrow=2)
#H(X,'S') #log(2*pi*exp(1))+0.5*log(0.75)
#MI(X, 1, 2, 'S') 
#MI(X, 1, 2, 'F') 
MI <- function(X, ind1, ind2, type){
  #Gaussian assp.
  if (type == 'SG'){
    rho <- cor(X[,c(ind1, ind2)])[1,2]
    mi <- - 0.5 * log(1-rho^2)  
  }else if (type == 'FG'){
    rho <- cor(X[,c(ind1, ind2)])[1,2]
    mi <- 0.5 * rho^2 / (1-rho^2) * (1/var(X[,ind1]) + 1/var(X[,ind2]))
  }else if (type == 'S' || type == 'F'){
    mi <- H(X[,ind1,drop=FALSE], type) + H(X[,ind1,drop=FALSE], type) - H(X[,c(ind1,ind2)], type)
    # mi <- max(mi, 0)
  }else{
    stop("error in type in func. MI!")
  }
  mi
}



# ====================================== Test both entropies ==================================
# # key params
# Iter <- 10;
# n <- 200;
# h <- n^(-0.3) #bandwidth
# 
# # My estimator
# est_L_entro <- rep(0,Iter)
# est_H_entro <- rep(0,Iter)
# 
# 
# for (iter in 1:Iter){
#   y <- rnorm(n,0,1)
#   y <- y^3
#   est_dens <- rep(0,n)
#   est_dens_D1 <- rep(0,n)
#   est_dens_D2 <- rep(0,n)
#   
#   for (j in 1:n){
#     est_dens[j] <- K_KER(y[j],y,h)
#     est_dens_D1[j] <- K_KER_D1(y[j],y,h)
#     est_dens_D2[j] <- K_KER_D2(y[j],y,h)
#   }
# 
# 
# est_L_entro[iter] <- mean(-log(est_dens));
# #way 1: use D2
# #     est_H_entro(iter) <- mean((est_dens_D2./est_dens - 0.5* (est_dens_D1./est_dens).^2));
# #way 2: use D1 only 
# est_H_entro[iter] <- -0.5 * mean((est_dens_D1/est_dens)^2);
#  
# }
# 
# hist(est_L_entro, 50)
# hist(est_H_entro, 50)

  
   
  
  